Background

The purpose of this workflow is to determine co-variates to include in DEG analysis.

Setup

Load packages

# Data manipulation and figures
library(tidyverse)
    # Multi-panel figures for ggplot
    library(cowplot)
library(venn)
library(biomaRt)
# Print tty table to knit file
library(knitr)
library(kableExtra)

`%notin%` <- Negate(`%in%`)

Set seed

set.seed(4389)

Customization

Set variable names and cutoffs for this workflow.

#Rdata file WITHIN project directory that holds cleaned data
data.file <- "data_clean/RSTR_RNAseq_data_combined_uniqueFULLID.RData"

#Prefix to give file names
basename <- "RSTR.Mtb"
#Define variable(s) of interest
#Used in PCA plots and to select significant genes to be used in module building
vars_of_interest <- c("condition", "Sample_Group")

Load data

#Load data
load(data.file)

This includes in the following samples.

## Loading required package: limma
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(vars_of_interest)` instead of `vars_of_interest` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
condition Sample_Group n
MEDIA LTBI 52
MEDIA RSTR 49
TB LTBI 52
TB RSTR 49

Kinship

kin <- read_csv("data_raw/kinship_Hawn_all.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   rowname = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
overlap <- intersect(kin$rowname, meta.combined$FULLIDNO)

kin <- kin %>% 
  filter(rowname %in% overlap) %>% 
  dplyr::select(rowname, all_of(overlap)) %>% 
  arrange(rowname) %>% 
  column_to_rownames()

#Subset RNAseq data
dat.combined.voom.kin <- dat.combined.voom
dat.combined.voom.kin$targets <- dat.combined.voom.kin$targets %>% 
  filter(FULLIDNO %in% overlap)
dat.combined.voom.kin$E <- dat.combined.voom.kin$E %>% 
  dplyr::select(all_of(c(dat.combined.voom.kin$targets$libID)))

Thus, kinship models include the following samples.

condition Sample_Group n
MEDIA LTBI 49
MEDIA RSTR 43
TB LTBI 49
TB RSTR 43

Data exploration

PCA

Main variables

Potential co-variates

Interaction models

Linear mixed effects models

source("scripts/kin.model.fxn.R")

Samples with kinship

lmekin.loop(dat = dat.combined.voom, kin = kin, 
             x.var = c("condition","Sample_Group"), ptID="FULLIDNO",
             co.var=c("KCHCA_AGE_YR_CURRENT","M0_KCVSEX","experiment"), 
             interaction=TRUE, 
             lm=FALSE, lme=TRUE,
             outdir="results/model_selection/",
             name="RSTR.interaction_age.sex.batch",
             processors=5, p.method="BH")

lmekin.loop(dat = dat.combined.voom, kin = kin, 
             x.var = c("condition","Sample_Group"), ptID="FULLIDNO",
             co.var=c("M0_KCVSEX","experiment"), 
             interaction=TRUE, 
             lm=FALSE, lme=TRUE,
             outdir="results/model_selection/",
             name="RSTR.interaction_sex.batch",
             processors=5, p.method="BH")

lmekin.loop(dat = dat.combined.voom, kin = kin, 
             x.var = c("condition","Sample_Group"), ptID="FULLIDNO",
             co.var=c("experiment"), 
             interaction=TRUE, 
             lm=FALSE, lme=TRUE,
             outdir="results/model_selection/", 
             name="RSTR.interaction_batch",
             processors=5, p.method="BH")
model.files <- list.files(path="results/model_selection", full.names = TRUE,)
model.files <- model.files[grepl("batch.model.results.csv.gz", model.files)]
#load results
gene_pval <- data.frame()

for(filename in model.files){
  gene_pval <- read_csv(filename) %>% 
    bind_rows(gene_pval)
}

Format model result labels

gene_pval_all <- gene_pval%>% 
  mutate(CHR = "allCHR", samples="kinSAMPLE") %>% 
  mutate(coVar = gsub("RSTR.interaction_","", group),
         coVar = gsub("_all","", coVar)) %>% 
  mutate(coVar = factor(coVar, levels =c("age.sex.batch","sex.batch","batch")))

Model fits (sigma)

Compare models from the same sample set, including only samples with RNA-seq and kinship data (N = 92).

Kinship

Co-variates CHR Model Number of genes best fit Mean difference in sigma
age.sex.batch allCHR lme 2820 0.0065457
allCHR lmekin 11152 0.0084771
sex.batch allCHR lme 3044 0.0064979
allCHR lmekin 10928 0.0081801
batch allCHR lme 3156 0.0065575
allCHR lmekin 10816 0.0078584

Overall fits are similar between models with and without kinship correction, as seen by values along the 1:1 line. However, the majority of genes are better fit to a small degree (lower sigma) by the kinship model.

Age

CHR Co-variates Number of genes best fit Mean difference in sigma
allCHR age.sex.batch 9309 0.0009389
sex.batch 4663 0.0005466

The best fit model with or without age is split by genes.

Sex

CHR Co-variates Number of genes best fit Mean difference in sigma
allCHR batch 5718 0.0006842
sex.batch 8254 0.0011238

Similarly, there is variability in best fit with and without sex.

Significant genes

Compare models from the maximum sample sets. Kinship models contain 92 individuals while non-kinship contain 101. Age is significant for 0 genes. Thus, age is not included in venns.

## quartz_off_screen 
##                 2

Only main terms.

## quartz_off_screen 
##                 2

DEGs (significant for interaction or RSTR)

## Joining, by = c("samples", "model", "CHR", "coVar")
DEG FDR < 0.2
samples model CHR coVar Mtb:RSTR interaction RSTR Total
kinSAMPLE lme allCHR age.sex.batch 191 7 195
sex.batch 191 6 194
batch 191 7 195
lmekin age.sex.batch 260 0 260
sex.batch 253 0 253
batch 254 1 255

R session

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] limma_3.44.3     kableExtra_1.3.1 knitr_1.30       biomaRt_2.44.4  
##  [5] venn_1.9         cowplot_1.1.1    forcats_0.5.0    stringr_1.4.0   
##  [9] dplyr_1.0.3      purrr_0.3.4      readr_1.4.0      tidyr_1.1.2     
## [13] tibble_3.0.5     ggplot2_3.3.3    tidyverse_1.3.0 
## 
## loaded via a namespace (and not attached):
##  [1] fs_1.5.0             lubridate_1.7.9.2    bit64_4.0.5         
##  [4] webshot_0.5.2        progress_1.2.2       httr_1.4.2          
##  [7] tools_4.0.2          backports_1.2.1      R6_2.5.0            
## [10] DBI_1.1.1            BiocGenerics_0.34.0  colorspace_2.0-0    
## [13] withr_2.4.0          tidyselect_1.1.0     prettyunits_1.1.1   
## [16] bit_4.0.4            curl_4.3             compiler_4.0.2      
## [19] cli_2.2.0            rvest_0.3.6          Biobase_2.48.0      
## [22] xml2_1.3.2           labeling_0.4.2       scales_1.1.1        
## [25] askpass_1.1          rappdirs_0.3.1       digest_0.6.27       
## [28] rmarkdown_2.6        pkgconfig_2.0.3      htmltools_0.5.1     
## [31] dbplyr_2.0.0         highr_0.8            rlang_0.4.10        
## [34] readxl_1.3.1         rstudioapi_0.13      RSQLite_2.2.2       
## [37] generics_0.1.0       farver_2.0.3         jsonlite_1.7.2      
## [40] magrittr_2.0.1       Rcpp_1.0.6           munsell_0.5.0       
## [43] S4Vectors_0.26.1     fansi_0.4.2          lifecycle_0.2.0     
## [46] stringi_1.5.3        yaml_2.2.1           BiocFileCache_1.12.1
## [49] grid_4.0.2           blob_1.2.1           parallel_4.0.2      
## [52] crayon_1.3.4         haven_2.3.1          hms_1.0.0           
## [55] pillar_1.4.7         admisc_0.11          stats4_4.0.2        
## [58] reprex_0.3.0         XML_3.99-0.5         glue_1.4.2          
## [61] evaluate_0.14        modelr_0.1.8         vctrs_0.3.6         
## [64] selectr_0.4-2        cellranger_1.1.0     gtable_0.3.0        
## [67] openssl_1.4.3        assertthat_0.2.1     xfun_0.20           
## [70] broom_0.7.3          viridisLite_0.3.0    AnnotationDbi_1.50.3
## [73] memoise_1.1.0        IRanges_2.22.2       ellipsis_0.3.1